# Azure ML command component: k-nearest-neighbour lookup (cosine similarity)
# between the vectors stored in two JSONL files.
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json

# What kind of component this is and how it is identified.
type: command
name: jsonl_knn_cosine_similarity
display_name: JSONL k-Nearest Neighbours Cosine Similarity
# Quoted so the pre-release suffix is unambiguously read as a string.
version: "0.0.1pre1"

# Human-readable summary; kept as a literal block so the numbered list
# keeps its line breaks.
description: |
  Takes two JSONL files, 'input' and 'examples'.
  Given a key containing a vector in each file, for each line in the input:

  1. Compute the cosine similarity to every line in the examples
  2. Store the examples with the k largest values in the designated output key
# Declares that identical inputs yield identical outputs.
is_deterministic: true

# Parameters accepted by the component. All of them are mandatory; the three
# encodings fall back to utf-8-sig when the caller does not override them.
inputs:
  # Records to annotate, and how that file is encoded.
  input_dataset:
    description: Dataset containing JSONL input
    type: uri_file
    optional: false
  input_encoding:
    description: Encoding format of the input dataset
    type: string
    default: utf-8-sig
    optional: false

  # Candidate neighbours, and how that file is encoded.
  example_dataset:
    description: Dataset containing JSONL example data
    type: uri_file
    optional: false
  example_encoding:
    description: Encoding format of the example dataset
    type: string
    default: utf-8-sig
    optional: false

  # Encoding used when writing the result file.
  output_encoding:
    description: Encoding format of the output dataset
    type: string
    default: utf-8-sig
    optional: false

  # Names of the JSON keys read from, and written to, each record.
  input_vector_key:
    description: Key in the input dataset which contains the vector
    type: string
    optional: false
  example_vector_key:
    description: Key in the example dataset which contains the vector
    type: string
    optional: false
  output_key:
    description: Key in which to store the list of k-nearest neighbours
    type: string
    optional: false

  # The "k" in k-nearest neighbours.
  k_nearest:
    description: How many neighbours to select
    type: integer
    optional: false
  
  
# Single result file produced by the component.
outputs:
  output_dataset:
    description: JSONL file containing inputs with k-nearest neighbours appended
    type: uri_file


# Local source directory shipped with the component; the script invoked
# below is addressed relative to it.
code: ./src/

# Invocation of the script. The ">-" folded scalar joins the lines below with
# single spaces and drops the trailing newline, so this is one command line.
# Each argument maps one-to-one onto a declared input/output of the same name.
#
# Every substituted value is wrapped in double quotes so that a key name or
# path containing whitespace or glob characters is passed to the script as a
# single argument instead of being split.
# NOTE(review): this assumes the command string is interpreted by a
# POSIX-style shell on the target compute - confirm.
command: >-
  python ./jsonl_knn_cosine_similarity.py
  --input_dataset "${{ inputs.input_dataset }}"
  --input_encoding "${{ inputs.input_encoding }}"
  --example_dataset "${{ inputs.example_dataset }}"
  --example_encoding "${{ inputs.example_encoding }}"
  --output_dataset "${{ outputs.output_dataset }}"
  --output_encoding "${{ inputs.output_encoding }}"
  --input_vector_key "${{ inputs.input_vector_key }}"
  --example_vector_key "${{ inputs.example_vector_key }}"
  --output_key "${{ inputs.output_key }}"
  --k_nearest "${{ inputs.k_nearest }}"


# Execution environment for the command above.
environment:
  # Placeholder value: it will be updated when the component uploads, so
  # treat this line as tool-managed rather than hand-edited.
  # NOTE(review): the exact rewrite mechanism is not visible in this file.
  image: azureml:promptbase_aml@latest
